{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import nltk\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of reviews: 25000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5814_8</td>\n",
       "      <td>1</td>\n",
       "      <td>With all this stuff going down at the moment w...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2381_9</td>\n",
       "      <td>1</td>\n",
       "      <td>\"The Classic War of the Worlds\" by Timothy Hin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7759_3</td>\n",
       "      <td>0</td>\n",
       "      <td>The film starts with a manager (Nicholas Bell)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3630_4</td>\n",
       "      <td>0</td>\n",
       "      <td>It must be assumed that those who praised this...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9495_8</td>\n",
       "      <td>1</td>\n",
       "      <td>Superbly trashy and wondrously unpretentious 8...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  sentiment                                             review\n",
       "0  5814_8          1  With all this stuff going down at the moment w...\n",
       "1  2381_9          1  \"The Classic War of the Worlds\" by Timothy Hin...\n",
       "2  7759_3          0  The film starts with a manager (Nicholas Bell)...\n",
       "3  3630_4          0  It must be assumed that those who praised this...\n",
       "4  9495_8          1  Superbly trashy and wondrously unpretentious 8..."
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用pandas读入训练数据\n",
    "datafile = os.path.join('.','datas','labeledTrainData.tsv')\n",
    "df = pd.read_csv(datafile,sep=\"\\t\",escapechar=\"\\\\\")\n",
    "print(\"Number of reviews: {}\".format(len(df)))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci's character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ's music.<br /><br />Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.<br /><br />Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ's bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i've gave this subject....hmmm well i don't know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\""
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['review'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'classic war worlds timothy hines entertaining film obviously goes great effort lengths faithfully recreate h g wells classic book mr hines succeeds watched film appreciated fact standard predictable hollywood fare comes every year e g spielberg version tom cruise slightest resemblance book obviously everyone looks different things movie envision amateur critics look criticize everything others rate movie important bases like entertained people never agree critics enjoyed effort mr hines put faithful h g wells classic novel found entertaining made easy overlook critics perceive shortcomings'"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "对影评数做预处理，大概有一下环节：\n",
    "0,归一化处理，全部转为小写\n",
    "1，去掉html标签\n",
    "2，移除标点\n",
    "3，切分词/token\n",
    "4,去掉停用词\n",
    "5，重组为新的句子\n",
    "\"\"\"\n",
    "def clean_sentence(sentence):\n",
    "    sentence = sentence.lower() #0,归一化处理，全部转为小写\n",
    "    sentence = BeautifulSoup(sentence,'html.parser').get_text()#1，去掉html标签\n",
    "    sentence = re.sub('[\\W]',' ',sentence)#2，移除标点\n",
    "    tokens = nltk.word_tokenize(sentence)#3，切分词/token\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    sentence_words = [i for i in sentence.split() if i not in stop_words]#4,去掉停用词\n",
    "    return ' '.join(sentence_words)#5，重组为新的句子\n",
    "\n",
    "example = clean_sentence(df['review'][1])\n",
    "example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>review</th>\n",
       "      <th>cleaned_review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5814_8</td>\n",
       "      <td>1</td>\n",
       "      <td>With all this stuff going down at the moment w...</td>\n",
       "      <td>stuff going moment mj started listening music ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2381_9</td>\n",
       "      <td>1</td>\n",
       "      <td>\"The Classic War of the Worlds\" by Timothy Hin...</td>\n",
       "      <td>classic war worlds timothy hines entertaining ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7759_3</td>\n",
       "      <td>0</td>\n",
       "      <td>The film starts with a manager (Nicholas Bell)...</td>\n",
       "      <td>film starts manager nicholas bell giving welco...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3630_4</td>\n",
       "      <td>0</td>\n",
       "      <td>It must be assumed that those who praised this...</td>\n",
       "      <td>must assumed praised film greatest filmed oper...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9495_8</td>\n",
       "      <td>1</td>\n",
       "      <td>Superbly trashy and wondrously unpretentious 8...</td>\n",
       "      <td>superbly trashy wondrously unpretentious 80 ex...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  sentiment                                             review  \\\n",
       "0  5814_8          1  With all this stuff going down at the moment w...   \n",
       "1  2381_9          1  \"The Classic War of the Worlds\" by Timothy Hin...   \n",
       "2  7759_3          0  The film starts with a manager (Nicholas Bell)...   \n",
       "3  3630_4          0  It must be assumed that those who praised this...   \n",
       "4  9495_8          1  Superbly trashy and wondrously unpretentious 8...   \n",
       "\n",
       "                                      cleaned_review  \n",
       "0  stuff going moment mj started listening music ...  \n",
       "1  classic war worlds timothy hines entertaining ...  \n",
       "2  film starts manager nicholas bell giving welco...  \n",
       "3  must assumed praised film greatest filmed oper...  \n",
       "4  superbly trashy wondrously unpretentious 80 ex...  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 清洗数据添加到dataframe里面\n",
    "df['cleaned_review'] = df.review.apply(clean_sentence)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 5000)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"抽取bag of words 特征（使用sklearn 的CountVectorizer）\"\"\"\n",
    "vectorizer = CountVectorizer(max_features=5000)# 取最大5000个词\n",
    "train_data_features = vectorizer.fit_transform(df.cleaned_review).toarray()# 抽取特征值\n",
    "train_data_features.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25000\n",
      "<class 'numpy.ndarray'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 1, 1, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 1, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(len(train_data_features))\n",
    "print(type(train_data_features))\n",
    "train_data_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 训练分类器\n",
    "forest = RandomForestClassifier(n_estimators=100) # 构建100棵树\n",
    "forest = forest.fit(train_data_features,df.sentiment)#训练词向量（词向量的特征值）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[12500,     0],\n",
       "       [    0, 12500]], dtype=int64)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 在训练集上做个predict看看效果\n",
    "confusion_matrix(df.sentiment,forest.predict(train_data_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 删除不用的占内存变量\n",
    "del df\n",
    "del train_data_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of reviews: 25000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>review</th>\n",
       "      <th>cleaned_review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12311_10</td>\n",
       "      <td>Naturally in a film who's main themes are of m...</td>\n",
       "      <td>naturally film main themes mortality nostalgia...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8348_2</td>\n",
       "      <td>This movie is a disaster within a disaster fil...</td>\n",
       "      <td>movie disaster within disaster film full great...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5828_4</td>\n",
       "      <td>All in all, this is a movie for kids. We saw i...</td>\n",
       "      <td>movie kids saw tonight child loved one point k...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7186_2</td>\n",
       "      <td>Afraid of the Dark left me with the impression...</td>\n",
       "      <td>afraid dark left impression several different ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12128_7</td>\n",
       "      <td>A very accurate depiction of small time mob li...</td>\n",
       "      <td>accurate depiction small time mob life filmed ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id                                             review  \\\n",
       "0  12311_10  Naturally in a film who's main themes are of m...   \n",
       "1    8348_2  This movie is a disaster within a disaster fil...   \n",
       "2    5828_4  All in all, this is a movie for kids. We saw i...   \n",
       "3    7186_2  Afraid of the Dark left me with the impression...   \n",
       "4   12128_7  A very accurate depiction of small time mob li...   \n",
       "\n",
       "                                      cleaned_review  \n",
       "0  naturally film main themes mortality nostalgia...  \n",
       "1  movie disaster within disaster film full great...  \n",
       "2  movie kids saw tonight child loved one point k...  \n",
       "3  afraid dark left impression several different ...  \n",
       "4  accurate depiction small time mob life filmed ...  "
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取测试数据进行预测\n",
    "# 使用pandas读入训练数据\n",
    "datafile = os.path.join('.','datas','testData.tsv')\n",
    "df = pd.read_csv(datafile,sep=\"\\t\",escapechar=\"\\\\\")\n",
    "print(\"Number of reviews: {}\".format(len(df)))\n",
    "df['cleaned_review'] = df.review.apply(clean_sentence)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 5000)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data_features = vectorizer.fit_transform(df.cleaned_review).toarray()# 提取特征值\n",
    "test_data_features.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12311_10</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8348_2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5828_4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7186_2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12128_7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id  sentiment\n",
       "0  12311_10          1\n",
       "1    8348_2          0\n",
       "2    5828_4          0\n",
       "3    7186_2          1\n",
       "4   12128_7          0"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = forest.predict(test_data_features)\n",
    "output = pd.DataFrame({'id':df.id,'sentiment':result})\n",
    "output.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
